/*****************************************************************************
*
* Copyright 2019 NXP
* All Rights Reserved
*
*****************************************************************************
*
* THIS SOFTWARE IS PROVIDED BY NXP "AS IS" AND ANY EXPRESSED OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL NXP OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*
****************************************************************************/

/****************************************************************************
* Includes
****************************************************************************/

#ifdef APEX2_EMULATE 
  #include <tchar.h>
  #include "apu_lib.hpp"
  #include "apu_extras.hpp"
  #include "acf_lib.hpp"
  using namespace APEX2;
#else
  #include <stdio.h>
  #include "apex.h"
#endif

#include <umat.hpp>
#include <oal.h>

#include <string.h>
#include "common_time_measure.h"

//using namespace std;
#include "common_stringify_macros.h"
#include "apu_add_process.h"

#ifdef APEX2_EMULATE
  // Tile dimensions (in pixels) defined for the emulation build only.
  // NOTE(review): neither constant is referenced in the visible part of this
  // file; presumably they are consumed by emulation-side code - confirm.
  const int TILE_SIZE_X = 8;
  const int TILE_SIZE_Y = 8;
#endif

/****************************************************************************
* Function declarations
****************************************************************************/
// Software reference: writes the per-pixel sum of lInput0 and lInput1 (read as
// 8-bit values) into lOutput0 (written as 16-bit values). Always returns 0.
int pixel_a1_ref(vsdk::UMat lInput0, vsdk::UMat lInput1, vsdk::UMat lOutput0);
// Returns true when lOutput0 agrees with both lOutput1 and lOutput2 at every
// compared position, false on the first mismatch.
bool compare(vsdk::UMat lOutput0, vsdk::UMat lOutput1, vsdk::UMat lOutput2);

/****************************************************************************
* Main function
****************************************************************************/
/**
 * Demo entry point.
 *
 * Allocates two 8-bit input images and a 16-bit reference image, sets up two
 * CTemplate_Demo_Proc instances (constructed with indices 0 and 1), and then,
 * for five iterations, fills the inputs with random data, computes the
 * reference sum on the host, runs both processes and compares their outputs
 * against the reference.
 *
 * @return 0 when every iteration matched; otherwise the non-zero code
 *         reported by a failing step, or 1 when a buffer is empty or the
 *         outputs do not match the reference.
 */
int main(int, char**)
{
  // This is needed only for the Target Compiler
  // HW and resources init
  APEX_Init();

  const int lSrcWidth = 256;
  const int lSrcHeight = 256;

  // Allocate the input and reference buffers
  vsdk::UMat lInput0(lSrcHeight, lSrcWidth, VSDK_CV_8UC1);
  vsdk::UMat lInput1(lSrcHeight, lSrcWidth, VSDK_CV_8UC1);
  vsdk::UMat lOutput0_REF(lSrcHeight, lSrcWidth, VSDK_CV_16UC1);

  #ifdef DEF_TIME_MEASURE
    // to be called, if the run time has to be measured
    const int64_t ticks0 = FSL_Ticks();
  #endif

  int lRetVal = 0;

  // Configure two processes which shall run in parallel, each on one of the two APEX devices.
  CTemplate_Demo_Proc addProcess0(0);
  CTemplate_Demo_Proc addProcess1(1);
  lRetVal |= addProcess0.InitData(lInput0, lInput1);
  lRetVal |= addProcess1.InitData(lInput0, lInput1);

  // Bail out before touching any buffer internals: after a failed
  // initialization the output buffers may not be valid.
  if (lRetVal)
    return lRetVal;

  vsdk::UMat lOutput0_ACF1 = addProcess0.lOutput;
  vsdk::UMat lOutput0_ACF2 = addProcess1.lOutput;

  // Verify every buffer before dereferencing `.u` for the printout below and
  // before handing the buffers to the processes.
  if (lInput0.empty() || lInput1.empty() || lOutput0_REF.empty() ||
      lOutput0_ACF1.empty() || lOutput0_ACF2.empty())
  {
    printf("Input or output buffers are not allocated.\n");
    return 1;
  }

  // Control output
  printf("Data on: \n");
  printf("   Input 1           (%dx%d bytes) at %p\n", lSrcWidth, lSrcHeight, lInput0.u->handle);
  printf("   Input 2           (%dx%d bytes) at %p\n", lSrcWidth, lSrcHeight, lInput1.u->handle);
  printf("   Output APEX 1     (%dx%d words) at %p\n", lSrcWidth, lSrcHeight, lOutput0_ACF1.u->handle);
  printf("   Output APEX 2     (%dx%d words) at %p\n", lSrcWidth, lSrcHeight, lOutput0_ACF2.u->handle);

  // Connect IOs to the APEX processes
  printf("Initialize APEX process n. 1.\n");
  lRetVal |= addProcess0.Connect();

  printf("Initialize APEX process n. 2.\n");
  lRetVal |= addProcess1.Connect();

  if (lRetVal)
    return lRetVal;

  for (unsigned int i = 0; i < 5; ++i)
  {
    printf("Iteration %u\n", i);
    printf("=======================================\n");

    printf("Input random seed.\n");

    // Fill the Mats with the random data - note the {} around causing destruction of the Mat instances thus flushing the cache
    {
      vsdk::Mat lInput0Mat = lInput0.getMat(OAL_USAGE_CACHED);
      vsdk::Mat lInput1Mat = lInput1.getMat(OAL_USAGE_CACHED);
      const int size = lSrcWidth * lSrcHeight;
      for (int j = 0; j < size; ++j)
      {
        lInput0Mat.at<uint8_t>(j) = static_cast<uint8_t>(rand() % 256);
        lInput1Mat.at<uint8_t>(j) = static_cast<uint8_t>(rand() % 256);
      }
    }

    // Compute reference
    printf("Compute reference value via SW.\n");
    lRetVal |= pixel_a1_ref(lInput0, lInput1, lOutput0_REF);

    // Compute on APEXes
    printf("Start both APEXes.\n");
    // Keep the tick counter in 64 bits: narrowing it to int truncates the value.
    const int64_t lApuTicksStart = FSL_Ticks();

    lRetVal |= addProcess0.Start();
    lRetVal |= addProcess1.Start();

    printf("Wait for both done.\n");
    lRetVal |= addProcess0.Wait();
    lRetVal |= addProcess1.Wait();

    const int64_t lApuTicks = FSL_Ticks() - lApuTicksStart;
    printf("%.6f sec and %lld ticks ApuRUNTIME, \n",
           static_cast<float>(FSL_TicksToSeconds(lApuTicks)),
           static_cast<long long>(lApuTicks));

    if (lRetVal)
      return lRetVal;

    // Compare the results
    printf("Comparison: ");
    if (!compare(lOutput0_REF, lOutput0_ACF1, lOutput0_ACF2))
    {
      printf("Data do not match.\n\n");
      return 1;
    }
    printf("All three outputs match.\n\n");
  }

  #ifdef DEF_TIME_MEASURE
    const int64_t ticks1 = FSL_Ticks();
    const int64_t cpu_ticks = ticks1 - ticks0;
    const int64_t apex_ticks = total_apex_ticks;
    const int64_t data_load_ticks = total_load_ticks;
    // Only half of the accumulated APEX ticks is charged because the two
    // APEX processes execute in parallel.
    const int64_t arm_ticks = cpu_ticks - apex_ticks/2 - data_load_ticks;
    const double armTimeInSecs = FSL_TicksToSeconds(arm_ticks);
    const double dataLoadTimeInSecs = FSL_TicksToSeconds(data_load_ticks);
    const double apexTimeInSecs = FSL_TicksToSeconds(apex_ticks);
    const double cpuTimeInSecs = FSL_TicksToSeconds(cpu_ticks);

    // int64_t is printed via long long / %lld so the format is correct
    // regardless of the width of `long` on the build platform.
    printf("\n___________________________________________________\n Processes took \n");
    printf("\t%.6f sec and %lld ticks on ARM, \n",
           static_cast<float>(armTimeInSecs), static_cast<long long>(arm_ticks));

    printf("\t%.6f sec and %lld ticks on APEX, \n",
           static_cast<float>(apexTimeInSecs)/2, static_cast<long long>(apex_ticks/2)); // two APEX-processes are executing in parallel

    printf("\t%.6f sec and %lld ticks for data loading and \n",
           static_cast<float>(dataLoadTimeInSecs), static_cast<long long>(data_load_ticks));

    printf("\t%.6f sec and %lld ticks in total\n"
           "           \n___________________________________________________\n",
           static_cast<float>(cpuTimeInSecs), static_cast<long long>(cpu_ticks));
  #else
    printf("\n____________________________________________________\n"
          "Processes finished"
          "\n____________________________________________________\n");
  #endif

  // Every failure path has already returned, so this is 0 here; returning the
  // accumulated status keeps the exit code honest if that ever changes.
  return lRetVal;
}

/**
 * Host-side reference for the pixel-wise addition.
 *
 * For every (row, column) position of lInput0, reads one unsigned char from
 * each input and stores their sum as an unsigned short in lOutput0.
 * The iteration bounds are taken from lInput0 only.
 *
 * @param lInput0  first operand, read as unsigned char elements
 * @param lInput1  second operand, read as unsigned char elements
 * @param lOutput0 destination, written as unsigned short elements
 * @return always 0
 */
int pixel_a1_ref(vsdk::UMat lInput0, vsdk::UMat lInput1, vsdk::UMat lOutput0)
{
  vsdk::Mat firstSrc = lInput0.getMat(OAL_USAGE_CACHED);
  vsdk::Mat secondSrc = lInput1.getMat(OAL_USAGE_CACHED);
  vsdk::Mat sumDst = lOutput0.getMat(OAL_USAGE_CACHED);

  for (int row = 0; row < firstSrc.rows; ++row)
  {
    for (int col = 0; col < firstSrc.cols; ++col)
    {
      const unsigned char lhs = firstSrc.at<unsigned char>(row, col);
      const unsigned char rhs = secondSrc.at<unsigned char>(row, col);
      // Operands are promoted to int before the addition, so the sum of two
      // 8-bit values cannot wrap before it is stored as 16 bits.
      sumDst.at<unsigned short>(row, col) = static_cast<unsigned short>(lhs + rhs);
    }
  }

  return 0;
}

/**
 * Checks that two outputs are identical to a reference output.
 *
 * Elements are compared as unsigned short: the reference buffer in this file
 * is allocated as VSDK_CV_16UC1 and filled through at<unsigned short>() by
 * pixel_a1_ref. Reading it through an 8-bit accessor would index single bytes
 * and leave part of every row uncompared.
 * NOTE(review): lOutput1 and lOutput2 are assumed to hold 16-bit elements as
 * well - confirm against CTemplate_Demo_Proc::lOutput.
 *
 * @param lOutput0 reference result
 * @param lOutput1 first result to verify
 * @param lOutput2 second result to verify
 * @return true when all three matrices have the same rows/cols and every
 *         element of lOutput1 and lOutput2 equals the reference; false
 *         otherwise
 */
bool compare(vsdk::UMat lOutput0, vsdk::UMat lOutput1, vsdk::UMat lOutput2)
{
  vsdk::Mat lpOutput0Mat = lOutput0.getMat(OAL_USAGE_CACHED);
  vsdk::Mat lpOutput1Mat = lOutput1.getMat(OAL_USAGE_CACHED);
  vsdk::Mat lpOutput2Mat = lOutput2.getMat(OAL_USAGE_CACHED);

  // Differently sized results can never match, and indexing them with the
  // reference's bounds would read outside their data.
  if (lpOutput1Mat.rows != lpOutput0Mat.rows || lpOutput1Mat.cols != lpOutput0Mat.cols ||
      lpOutput2Mat.rows != lpOutput0Mat.rows || lpOutput2Mat.cols != lpOutput0Mat.cols)
    return false;

  for (int i = 0; i < lpOutput0Mat.rows; i++)
  {
    for (int j = 0; j < lpOutput0Mat.cols; j++)
    {
      const unsigned short lReference = lpOutput0Mat.at<unsigned short>(i, j);
      if (lReference != lpOutput1Mat.at<unsigned short>(i, j) ||
          lReference != lpOutput2Mat.at<unsigned short>(i, j))
        return false;
    }
  }

  return true;
}
